Road safety rules and regulations are designed to protect citizens from fatal incidents. Although policies are in place, we observe negligent behaviour by drivers, which leads to serious injuries or fatal crashes. It is of utmost interest to the authorities to understand and analyse human behaviour in order to take the necessary corrective and preventive actions.
From the given data (i.e. the vehicle details, the weather details and the vehicle travelling details) we need to predict the Driving Style of a particular driver: "Aggressive", "Normal" or "Vague".
In this particular problem we are interested in finding the Aggressive drivers, as that is most beneficial for the stakeholders. The stakeholders are:
Research/ Data Service Providers
For the business, false positives are acceptable, but false negatives (i.e. predicting an Aggressive driver as non-Aggressive) directly impact the stakeholders; therefore we have to maximise the Recall value.
From the given data (i.e. the vehicle details, the weather details and the vehicle travelling details) we need to predict the Driving Style of a particular driver: "Aggressive", "Normal" or "Vague". These patterns are pre-identified in the given dataset, and one has to infer the style for new datasets by prediction.
As we have understood from the business, the model has to be tuned for Recall, as false negatives are a strict no. We also have to provide the top 10 patterns that identify an Aggressive driving style.
# Importing Necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.offline as pyoff
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
%matplotlib inline
#Importing lib for Modelbuilding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
init_notebook_mode(connected=True)
# ---- Load the three training tables and merge them into one frame ----
train_data = pd.read_csv('Train.csv')
train_vehicletravelling_data = pd.read_csv('Train_Vehicletravellingdata.csv')
train_weather_data =pd.read_csv('Train_WeatherData.csv')
# Give each table short, consistent column names.
train_data.columns = ['ID','Vehicle_len','Vehicle_Wt','Num_Axle','DrivingStyle']
train_vehicletravelling_data.columns = ['ID','date_time_trip','road_lane','vehicle_speed','pre_vehicle_ID','pre_vehicle_speed','pre_vehicle_wt','pre_vehicle_len','time_gap_pre_vehicle','road_condition']
train_weather_data.columns = ['ID','date_time_trip','air_temp','precipitation_type','precipitation_intensity','rel_humidity','wind_direction','wind_speed','lighting_condition']
print(train_data.shape)
print(train_weather_data.shape)
print(train_vehicletravelling_data.shape)
# Inner-join travelling + weather on (ID, timestamp), then attach the
# per-vehicle attributes with an outer join on ID.
train_vehicletravelling_weather = pd.merge(train_vehicletravelling_data, train_weather_data, how='inner', on=['ID','date_time_trip'])
train = pd.merge(train_vehicletravelling_weather,train_data, how='outer', on=['ID'])
train.columns
train.shape
# NOTE(review): this result is never used later; train.DrivingStyle itself
# still holds the numeric codes at this point.
train_driving_style = train.DrivingStyle.replace({1: 'Aggressive', 2: 'Normal', 3: 'Vague'})
train.head()
train.describe().transpose()
train.shape
Tried both merge approaches; both result files are attached.
# Count the missing values per column and visualise them as a bar chart.
train.isnull().sum()
plt.figure(figsize=(10,10))
train.isnull().sum().plot(kind='bar',
figsize=(10,6),
color="blue",
alpha = 0.7,
fontsize=13)
plt.title("NA's (in numbers)")
plt.show()
# Separate the target and inspect feature correlations without it.
target = train["DrivingStyle"]
train_without_target = train.drop(["DrivingStyle"],axis=1)
corr = train_without_target.corr()
f, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr,cmap='coolwarm',vmax=.3)
# plt.figure(figsize=(10,10)
# Class balance of the target variable.
train.DrivingStyle.value_counts().plot(kind='bar',
figsize=(10,6),
color="red",
alpha = 0.7,
fontsize=13)
plt.title('Driving Style (in numbers)')
plt.xlabel('Driving Style')
plt.ylabel('Count')
plt.show()
def generate_layout_bar(col_name):
    """Build the plotly Layout used by the percentage bar charts.

    The figure is fixed at 800x600, titled after *col_name*, and uses a
    14pt black Courier New font for the title and both axes; the overall
    chart font is 12pt white Courier New.
    """
    courier_black = dict(family='Courier New, monospace', size=14, color='black')
    return go.Layout(
        autosize=False,          # fixed size — width/height given explicitly
        width=800,               # width of the figure in pixels
        height=600,              # height of the figure in pixels
        title="Distribution of {} column".format(col_name),
        titlefont=courier_black,
        xaxis=dict(tickfont=courier_black),
        yaxis=dict(
            # range=[0,100],
            title='Percentage',
            titlefont=dict(size=14, color='black'),
            tickfont=courier_black,
        ),
        font=dict(family='Courier New, monospace', color="white", size=12),
    )
def plot_bar(col_name):
    """Render an iplot bar chart of the percentage distribution of
    `train[col_name]`, one bar per distinct value."""
    counts = train[col_name].value_counts()
    # Share of each value, rounded to 4 decimals and expressed in percent.
    pct = np.round(counts.values.astype(float) / counts.values.sum(), 4) * 100
    bar = go.Bar(
        x=counts.index.astype(str),
        y=pct,
        text=['{}%'.format(p) for p in pct],
        textposition='auto',
        marker=dict(color='#0047AB'),
    )
    fig = go.Figure(data=[bar], layout=generate_layout_bar(col_name=col_name))
    return iplot(fig)
# Count plots for the categorical features.
sns.catplot(x="road_lane", kind="count", palette="ch:.25", data=train);
Two lanes on the road and the traffic is equal on both the lanes
sns.catplot(x="road_condition", kind="count", palette="ch:.30", data=train);
Road Condition has four labels Dry, Wet, Snow Covered and Visible Tracks
sns.catplot(x="lighting_condition", kind="count", palette="ch:.30", data=train);
plot_bar('precipitation_type')
Most of the time 93% weather is clear according to the data
plot_bar('precipitation_intensity')
# Replace the blank (' ') intensity category with an explicit 'NotKnown' label.
train.precipitation_intensity = train.precipitation_intensity.replace(to_replace=' ', value='NotKnown')
plot_bar('lighting_condition')
Lighting Condition is mostly Night time i.e. the data that is collected is mostly during Night Time
plot_bar('Num_Axle')
Most of the vehicles have 2 axles
def plotHistogramGraph(col_name):
    """Display a plotly histogram of `train[col_name]` in red (#CC0E1D)."""
    trace = go.Histogram(x=train[col_name], marker=dict(color='#CC0E1D'))
    layout = go.Layout(title="Histogram of {}".format(col_name))
    iplot(go.Figure(data=[trace], layout=layout))
plotHistogramGraph('vehicle_speed')
Mostly vehicle speed is between 70 to 100
plotHistogramGraph('wind_direction')
The Wind direction is mostly between 160 to 190 degrees
train.air_temp.sample(frac =0.1)
plt.figure(figsize=(15,10))
sns.countplot(x='air_temp', data=train)
The Min temp is -13 degree and Max is 24 degree. which may mean that the data set belongs to colder region
# Recode the numeric DrivingStyle codes into the *strings* '1'/'2'/'3'.
# NOTE(review): downstream recall_score calls must therefore use string
# labels (labels=['1']), not the integer 1.
DrivingStyle_dict = {1:'1',
2:'2',
3:'3',
}
train = train.replace({"DrivingStyle":DrivingStyle_dict })
# Trip counts per (driving style, air temperature) pair for the bar chart.
driving_air_temp = train.groupby(['DrivingStyle','air_temp']).size().to_frame()
driving_air_temp = driving_air_temp.reset_index()
driving_air_temp.columns = ['DrivingStyle','air_temp','Count']
np.random.seed(123)
driving_air_temp.sample(frac =0.1)
# Grouped bar chart: trip counts per air temperature, one trace per style.
trace1 = go.Bar(x = driving_air_temp.air_temp[driving_air_temp.DrivingStyle=='1'],
                y = driving_air_temp.Count[driving_air_temp.DrivingStyle=='1'],
                text = driving_air_temp.Count[driving_air_temp.DrivingStyle=='1'],
                textposition = 'auto',
                name = 'Aggressive')
trace2 = go.Bar(x = driving_air_temp.air_temp[driving_air_temp.DrivingStyle=='2'],
                y = driving_air_temp.Count[driving_air_temp.DrivingStyle=='2'],
                text = driving_air_temp.Count[driving_air_temp.DrivingStyle=='2'],
                textposition = 'auto',
                name = 'Normal')
trace3 = go.Bar(x = driving_air_temp.air_temp[driving_air_temp.DrivingStyle=='3'],
                y = driving_air_temp.Count[driving_air_temp.DrivingStyle=='3'],
                text = driving_air_temp.Count[driving_air_temp.DrivingStyle=='3'],
                textposition = 'auto',
                name = 'Vague')
data = [trace1, trace2, trace3]
layout = go.Layout(width = 1000,
                   height = 600, title = 'Air Temperature and Driving Style',
                   xaxis = dict(title = 'Air Temperature'),
                   # BUG FIX: the upper y-bound must be a scalar; the original
                   # passed the entire Count Series (driving_air_temp.Count+10).
                   # Every sibling chart below already uses .max().
                   yaxis = dict(title = 'Counts', range=[0, driving_air_temp.Count.max()+10]))
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Grouped bar chart: trip counts per precipitation type, one trace per style.
driving_prec_veh = train.groupby(['DrivingStyle','precipitation_type']).size().to_frame()
driving_prec_veh = driving_prec_veh.reset_index()
driving_prec_veh.columns = ['DrivingStyle','precipitation_type','Count']
trace1 = go.Bar(x = driving_prec_veh.precipitation_type[driving_prec_veh.DrivingStyle=='1'],
                y = driving_prec_veh.Count[driving_prec_veh.DrivingStyle=='1'],
                text = driving_prec_veh.Count[driving_prec_veh.DrivingStyle=='1'],
                textposition = 'auto',
                name = 'Aggressive')
# BUG FIX: trace2's bar labels previously used the style-'3' counts
# (copy-paste error); they must come from style '2' like its x and y do.
trace2 = go.Bar(x = driving_prec_veh.precipitation_type[driving_prec_veh.DrivingStyle=='2'],
                y = driving_prec_veh.Count[driving_prec_veh.DrivingStyle=='2'],
                text = driving_prec_veh.Count[driving_prec_veh.DrivingStyle=='2'],
                textposition = 'auto',
                name = 'Normal')
trace3 = go.Bar(x = driving_prec_veh.precipitation_type[driving_prec_veh.DrivingStyle=='3'],
                y = driving_prec_veh.Count[driving_prec_veh.DrivingStyle=='3'],
                text = driving_prec_veh.Count[driving_prec_veh.DrivingStyle=='3'],
                textposition = 'auto',
                name = 'Vague')
data = [trace1, trace2, trace3]
layout = go.Layout(width = 1000,
                   height = 600, title = 'Driving Style with Precipitation Type',
                   xaxis = dict(title = 'Precipitation Type'),
                   yaxis = dict(title = 'Counts', range=[0, driving_prec_veh.Count.max()+10]))
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Some people drive aggressively in spite of rain and snow.
# Grouped bar chart: trip counts per road condition, one trace per style.
Driving_Road_Condition = train.groupby(['DrivingStyle','road_condition']).size().to_frame()
Driving_Road_Condition = Driving_Road_Condition.reset_index()
Driving_Road_Condition.columns = ['DrivingStyle','road_condition','Count']
trace1 = go.Bar(x = Driving_Road_Condition.road_condition[Driving_Road_Condition.DrivingStyle=='1'],
                y = Driving_Road_Condition.Count[Driving_Road_Condition.DrivingStyle=='1'],
                text = Driving_Road_Condition.Count[Driving_Road_Condition.DrivingStyle=='1'],
                textposition = 'auto',
                # FIX: legend label spelled 'Aggressive' to match the other charts.
                name = 'Aggressive')
trace2 = go.Bar(x = Driving_Road_Condition.road_condition[Driving_Road_Condition.DrivingStyle=='2'],
                y = Driving_Road_Condition.Count[Driving_Road_Condition.DrivingStyle=='2'],
                text = Driving_Road_Condition.Count[Driving_Road_Condition.DrivingStyle=='2'],
                textposition = 'auto',
                name = 'Normal')
trace3 = go.Bar(x = Driving_Road_Condition.road_condition[Driving_Road_Condition.DrivingStyle=='3'],
                y = Driving_Road_Condition.Count[Driving_Road_Condition.DrivingStyle=='3'],
                text = Driving_Road_Condition.Count[Driving_Road_Condition.DrivingStyle=='3'],
                textposition = 'auto',
                name = 'Vague')
data = [trace1, trace2, trace3]
layout = go.Layout(width = 1000,
                   height = 600, title = 'Driving Style with Road Condition',
                   xaxis = dict(title = 'Road Condition'),
                   yaxis = dict(title = 'Counts', range=[0, Driving_Road_Condition.Count.max()+10]))
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Most of the drivers drive in a normal way, but a few drive aggressively in spite of wet or snow-covered roads.
# Grouped bar chart: trip counts per lighting condition, one trace per style.
Driving_Lighting_Condition = train.groupby(['DrivingStyle','lighting_condition']).size().to_frame()
Driving_Lighting_Condition = Driving_Lighting_Condition.reset_index()
Driving_Lighting_Condition.columns = ['DrivingStyle','lighting_condition','Count']
trace1 = go.Bar(x = Driving_Lighting_Condition.lighting_condition[Driving_Lighting_Condition.DrivingStyle=='1'],
                y = Driving_Lighting_Condition.Count[Driving_Lighting_Condition.DrivingStyle=='1'],
                text = Driving_Lighting_Condition.Count[Driving_Lighting_Condition.DrivingStyle=='1'],
                textposition = 'auto',
                # FIX: legend label spelled 'Aggressive' to match the other charts.
                name = 'Aggressive')
trace2 = go.Bar(x = Driving_Lighting_Condition.lighting_condition[Driving_Lighting_Condition.DrivingStyle=='2'],
                y = Driving_Lighting_Condition.Count[Driving_Lighting_Condition.DrivingStyle=='2'],
                text = Driving_Lighting_Condition.Count[Driving_Lighting_Condition.DrivingStyle=='2'],
                textposition = 'auto',
                name = 'Normal')
trace3 = go.Bar(x = Driving_Lighting_Condition.lighting_condition[Driving_Lighting_Condition.DrivingStyle=='3'],
                y = Driving_Lighting_Condition.Count[Driving_Lighting_Condition.DrivingStyle=='3'],
                text = Driving_Lighting_Condition.Count[Driving_Lighting_Condition.DrivingStyle=='3'],
                textposition = 'auto',
                name = 'Vague')
data = [trace1, trace2, trace3]
layout = go.Layout(width = 1000,
                   height = 600, title = 'Driving Style with Lighting Condition',
                   xaxis = dict(title = 'Lighting Condition'),
                   yaxis = dict(title = 'Counts', range=[0, Driving_Lighting_Condition.Count.max()+10]))
fig = go.Figure(data=data, layout=layout)
iplot(fig)
At night the number of people driving aggressively is higher than at other times of the day, and people also drive more vaguely at night.
# Split the trip timestamp into date/time and calendar parts.
# Vectorised pandas .str operations replace the original element-wise Python
# loops; the new columns hold the same string values, in the same order
# (date, time, day, month, year, hour), as before.
dt_parts = train.date_time_trip.astype(str).str.split()
train['date'] = dt_parts.str[0]
train['time'] = dt_parts.str[1]
date_parts = train['date'].str.split('-')
train['day'] = date_parts.str[2]
train['month'] = date_parts.str[1]
train['year'] = date_parts.str[0]
# Hour is the part of the time string before the first ':'.
train['hour'] = train['time'].str.split(':').str[0]
# Grouped bar chart: trip counts per hour of day, one trace per style.
Driving_Hour = train.groupby(['DrivingStyle','hour']).size().to_frame()
Driving_Hour = Driving_Hour.reset_index()
Driving_Hour.columns = ['DrivingStyle','Hour','Count']
trace1 = go.Bar(x = Driving_Hour.Hour[Driving_Hour.DrivingStyle=='1'],
                y = Driving_Hour.Count[Driving_Hour.DrivingStyle=='1'],
                text = Driving_Hour.Count[Driving_Hour.DrivingStyle=='1'],
                textposition = 'auto',
                # FIX: legend label spelled 'Aggressive' to match the other charts.
                name = 'Aggressive')
trace2 = go.Bar(x = Driving_Hour.Hour[Driving_Hour.DrivingStyle=='2'],
                y = Driving_Hour.Count[Driving_Hour.DrivingStyle=='2'],
                text = Driving_Hour.Count[Driving_Hour.DrivingStyle=='2'],
                textposition = 'auto',
                name = 'Normal')
trace3 = go.Bar(x = Driving_Hour.Hour[Driving_Hour.DrivingStyle=='3'],
                y = Driving_Hour.Count[Driving_Hour.DrivingStyle=='3'],
                text = Driving_Hour.Count[Driving_Hour.DrivingStyle=='3'],
                textposition = 'auto',
                name = 'Vague')
data = [trace1, trace2, trace3]
layout = go.Layout(width = 1000,
                   height = 600, title = 'Driving Style with Hour',
                   xaxis = dict(title = 'Hour'),
                   yaxis = dict(title = 'Counts', range=[0, Driving_Hour.Count.max()+10]))
fig = go.Figure(data=data, layout=layout)
iplot(fig)
Very few people drove between 12 pm and 6 am, but comparatively the percentage of people driving aggressively in that window was very high. Between 2 pm and 12 am people drive more vaguely.
plot_bar('month')
No data for the month of June to October
# Scatter a 1% sample: own speed vs the preceding vehicle's speed.
temp = train.sample(frac=0.01)
X = temp.vehicle_speed
Y = temp.pre_vehicle_speed
plt.figure(figsize=(15,10))
plt.xlabel("Vehicle speed")
plt.ylabel("Preceeding Vehicle speed")
plt.title("Vehicle speed vs Preceeding Vehicle speed")
plt.scatter(X,Y)
plt.show()
The speed of preceeding Vehicle and the current vehicle are almost same
sns.catplot(x="road_condition", y="month",data=train,);
No Snow in the month of May and November
# temp = train.sample(frac=0.01)
# sns.catplot(x="DrivingStyle", y="vehicle_speed",col="road_condition", aspect=.6, kind="swarm", data=temp);
# train.Num_Axle
# Swarm plot on a 2% sample: speed vs axle count, per driving style.
temp = train.sample(frac=0.02)
sns.catplot(x="vehicle_speed", y="Num_Axle",col="DrivingStyle", kind="swarm", data=temp);
train.head()
train.columns
train.dtypes
# Drop the identifier and raw date/time columns before modelling.
train = train.drop(['date_time_trip','date','time','day','year','pre_vehicle_ID','month','hour'],axis=1)
train.columns
train.dtypes
If a column has fewer than 10 unique values, assign it as a categorical variable; otherwise treat it as a numerical variable.
# Split the columns into categorical vs numeric by cardinality (< 10 uniques
# -> categorical). Note 'ID' lands in num_cols by cardinality but is also
# appended to cat_cols so it is available as the grouping key in both sets.
cat_cols =[]
num_cols =[]
for i in train.columns:
    # The original also tested `i != 'month'`, but 'month' was already
    # dropped above, so that guard was dead code and has been removed.
    if len(np.unique(train[i])) < 10:
        cat_cols.append(i)
        print("{} : {} : {} ".format(i,len(np.unique(train[i])),np.unique(train[i])))
    else:
        num_cols.append(i)
ID = 'ID'
cat_cols.append(ID)
for i in cat_cols:
    train[i] = train[i].astype('category')
num_cols
train.isna().sum()
# Mean-impute the numeric weather / time-gap columns.
# BUG FIX: assign the filled column back instead of `inplace=True` on an
# attribute-accessed Series — that pattern is a silent no-op under pandas
# copy-on-write (the default from pandas 3.0).
train['air_temp'] = train['air_temp'].fillna(train['air_temp'].mean())
train['wind_speed'] = train['wind_speed'].fillna(train['wind_speed'].mean())
train['time_gap_pre_vehicle'] = train['time_gap_pre_vehicle'].fillna(train['time_gap_pre_vehicle'].mean())
train['wind_direction'] = train['wind_direction'].fillna(train['wind_direction'].mean())
train['rel_humidity'] = train['rel_humidity'].fillna(train['rel_humidity'].mean())
np.unique(train.precipitation_intensity)
# Aggregate trip-level rows up to one row per vehicle ID:
# categorical columns via max, numeric columns via mean.
train_data_cat_aggregated =train[cat_cols].groupby(['ID'], sort=False).max()
train_data_cat = train_data_cat_aggregated.reset_index()
train_data_cat = train_data_cat.rename(columns = {'index':'ID'})
train_data_cat.head(4)
print(train_data_cat.shape)
train_data_cat.columns
num_cols
train_data_num_aggregated =train[num_cols].groupby(['ID'], sort=False).mean()
train_data_num = train_data_num_aggregated.reset_index()
train_data_num.head()
train_data_num.shape
print(train_data_num.shape)
train_data_num.columns
# Recombine the two aggregates and build the modelling matrix.
aggr_train_data = pd.merge(train_data_num,train_data_cat, how='outer', on=['ID'])
print(aggr_train_data.shape)
aggr_train_data.head()
# NOTE: target holds the *string* codes '1'/'2'/'3' from the earlier recode.
target = aggr_train_data['DrivingStyle']
trainWithoutTarget = aggr_train_data.drop(['DrivingStyle','ID'],axis=1)
trainWithoutTarget.dtypes
# One-hot encode the categorical columns.
train_dummified = pd.get_dummies(trainWithoutTarget)
train_dummified.dtypes
# ---- Load and merge the test tables, mirroring the training pipeline ----
# (Test.csv has no DrivingStyle column — that is what we predict.)
test_data = pd.read_csv('Test.csv')
test_vehicletravelling_data = pd.read_csv('Test_Vehicletravellingdata.csv')
test_weather_data =pd.read_csv('Test_WeatherData.csv')
test_data.columns = ['ID','Vehicle_len','Vehicle_Wt','Num_Axle']
test_vehicletravelling_data.columns = ['ID','date_time_trip','road_lane','vehicle_speed','pre_vehicle_ID','pre_vehicle_speed','pre_vehicle_wt','pre_vehicle_len','time_gap_pre_vehicle','road_condition']
test_weather_data.columns = ['ID','date_time_trip','air_temp','precipitation_type','precipitation_intensity','rel_humidity','wind_direction','wind_speed','lighting_condition']
test_vehicletravelling_weather = pd.merge(test_vehicletravelling_data, test_weather_data, how='inner', on=['ID','date_time_trip'])
test = pd.merge(test_vehicletravelling_weather,test_data, how='outer', on=['ID'])
# The date/time features were dropped on the training side, so drop the raw
# timestamp (and the preceding-vehicle ID) here as well.
test = test.drop(['date_time_trip','pre_vehicle_ID'],axis=1)
# Disabled experiment: extracting month/hour for the test set (unused since
# those columns were dropped from the training matrix).
# date_test =[]
# time_test = []
# for i in test.date_time_trip:
# a = str(i).split()
# date_test.append(str(a[0]))
# time_test.append(str(a[1]))
# test['date']=date_test
# test['time']=time_test
# month_test = []
# hour_test = []
# for i in test.date:
# a = str(i).split('-')
# month_test.append(str(a[1]))
# for j in test.time:
# b = str(j).split(':')
# hour_test.append(str(b[0]))
# test['month'] = month_test
# test['hour'] = hour_test
If a column has fewer than 10 unique values, assign it as a categorical variable; otherwise treat it as a numerical variable.
# Split the test columns into categorical vs numeric by cardinality,
# mirroring the training-side logic.
cat_cols_test =[]
num_cols_test =[]
for i in test.columns:
    if (len(np.unique(test[i])) < 10) :
        cat_cols_test.append(i)
        print("{} : {} : {} ".format(i,len(np.unique(test[i])),np.unique(test[i])))
    else:
        num_cols_test.append(i)
ID = 'ID'
cat_cols_test.append(ID)
for i in cat_cols_test:
    test[i] = test[i].astype('category')
test.dtypes
# test[datetimecols] = test[datetimecols].apply(pd.to_numeric)
# test.dtypes
num_cols_test
test.isna().sum()
# Mean-impute the numeric columns.
# BUG FIX: assign the filled column back instead of `inplace=True` on an
# attribute-accessed Series — a silent no-op under pandas copy-on-write.
test['time_gap_pre_vehicle'] = test['time_gap_pre_vehicle'].fillna(test['time_gap_pre_vehicle'].mean())
test['air_temp'] = test['air_temp'].fillna(test['air_temp'].mean())
test['rel_humidity'] = test['rel_humidity'].fillna(test['rel_humidity'].mean())
test['wind_direction'] = test['wind_direction'].fillna(test['wind_direction'].mean())
test['wind_speed'] = test['wind_speed'].fillna(test['wind_speed'].mean())
np.unique(test.precipitation_intensity)
# Same blank-category replacement as on the training side.
test['precipitation_intensity'] = test['precipitation_intensity'].replace(to_replace=' ', value='NotKnown').astype('category')
# Aggregate the test rows per vehicle ID exactly like the training data:
# categorical columns via max, numeric columns via mean.
test_data_cat_aggregated =test[cat_cols_test].groupby(['ID'], sort=False).max()
test_data_cat = test_data_cat_aggregated.reset_index()
test_data_cat = test_data_cat.rename(columns = {'index':'ID'})
test_data_cat.head(4)
test_data_cat.dtypes
test_data_num_aggregated =test[num_cols_test].groupby(['ID'], sort=False).mean()
test_data_num = test_data_num_aggregated.reset_index()
print(test_data_num.shape)
test_data_num.head()
test_data_num.columns
test_data_final = pd.merge(test_data_num,test_data_cat ,on=['ID'],how='outer')
test_data_final = test_data_final.drop(['ID'],axis=1)
# test_data_ID.dtypes
# NOTE(review): if a category level present in training is absent here, the
# dummy columns will not line up with train_dummified — compare the two
# .shape outputs below (see the commented 'Num_Axle_8' patch).
test_dummified = pd.get_dummies(test_data_final)
train_dummified.dtypes
test_dummified.dtypes
test_dummified.shape
train_dummified.shape
# test_dummified['Num_Axle_8'] = 0
test_dummified.head(4)
test.isna().sum()
train_dummified.head()
import random
# NOTE: this seeds only Python's `random` module; train_test_split uses
# numpy's RNG, so pass random_state= for true reproducibility.
random.seed(1234)
# 75/25 split, stratified so the class balance is preserved in both parts.
x_train, x_val, y_train, y_val = train_test_split(train_dummified, target, stratify=target, test_size=0.25)
x_train.head()
# Multinomial logistic-regression baseline.
logistic_model = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial')
logistic_model.fit(x_train, y_train)
predictions_log_model_train = logistic_model.predict(x_train)
predictions_log_model_val = logistic_model.predict(x_val)
# BUG FIX: DrivingStyle was recoded to the *strings* '1'/'2'/'3', so the
# recall labels must be strings too — labels=[1] matches no class and
# reports 0.0. labels=['1'] restricts the recall to the Aggressive class.
print(recall_score(y_train,predictions_log_model_train,labels=['1'],average='macro'))
print(recall_score(y_val,predictions_log_model_val,labels=['1'],average='macro'))
#### Decision Tree
from sklearn.model_selection import GridSearchCV
# Single-candidate "grid": effectively fits one tuned tree with 5-fold CV.
parameters = {'max_depth':[7],'min_samples_split' : [10],'criterion':['gini']}
clf = GridSearchCV(DecisionTreeClassifier(), parameters, n_jobs=4,cv=5)
clf.fit(X=x_train, y=y_train)
tree_model = clf.best_estimator_
print (clf.best_score_, clf.best_params_)
predictions_tree_model_train = tree_model.predict(x_train)
predictions_tree_model_val = tree_model.predict(x_val)
# BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
print(recall_score(y_train,predictions_tree_model_train,labels=['1'],average='macro'))
print(recall_score(y_val,predictions_tree_model_val,labels=['1'],average='macro'))
# Plain random forest with a large number of trees, all cores.
rf = RandomForestClassifier(n_estimators = 1500, n_jobs=-1,random_state = 42)
rf.fit(x_train, y_train)
predictions_rf_train = rf.predict(x_train)
predictions_rf_val = rf.predict(x_val)
# BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
print(recall_score(y_train,predictions_rf_train,labels=['1'],average='macro'))
print(recall_score(y_val,predictions_rf_val,labels=['1'],average='macro'))
from sklearn.model_selection import RandomizedSearchCV
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier
# Random-forest hyper-parameter search space.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# BUG FIX: max_features='auto' was an alias of 'sqrt' for classifiers and was
# removed in scikit-learn 1.3, so ['auto', 'sqrt'] only duplicated one value.
max_features = ['sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)
rf = RandomForestClassifier(random_state = 42)
# NOTE: n_iter=1 samples a single candidate; raise it for a real search.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1, cv = 3, verbose=2, random_state=42)
rf_random.fit(x_train, y_train)
best_rf1= rf_random.best_estimator_
best_rf_features = rf_random.best_params_
predictions_bestrf_train1 = best_rf1.predict(x_train)
predictions_bestrf_val1 = best_rf1.predict(x_val)
# BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
print(recall_score(y_train,predictions_bestrf_train1,labels=['1'],average='macro'))
print(recall_score(y_val,predictions_bestrf_val1,labels=['1'],average='macro'))
# Predict the unlabelled test set and collect an ID + prediction frame.
predictions_bestrf_tst = best_rf1.predict(test_dummified)
test_data_ID = test_data_num.ID
rf_grid2= pd.DataFrame({'ID':test_data_ID})
rf_grid2['DrivingStyle'] = predictions_bestrf_tst
#pd.DataFrame(rf_grid3).to_csv("anita_rf_grid12.csv")
# Quick learning-rate sweep for gradient boosting; prints the Aggressive-class
# recall on train and validation for each rate.
learning_rates = [0.05, 0.1, 0.25, 0.5]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=125, learning_rate = learning_rate,min_samples_split=4, max_features=None, max_depth = 3, random_state = 20)
    gb.fit(x_train, y_train)
    predictions_gb_train = gb.predict(x_train)
    predictions_gb_val = gb.predict(x_val)
    # BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
    print(recall_score(y_train,predictions_gb_train,labels=['1'],average='macro'))
    print(recall_score(y_val,predictions_gb_val,labels=['1'],average='macro'))
# NOTE: despite the name, this is a GradientBoostingClassifier, not XGBoost.
xgb_model = GradientBoostingClassifier(n_estimators=125, learning_rate = 0.1,min_samples_split=4, max_features=None, max_depth = 3, random_state = 20)
# BUG FIX: random_state only takes effect (and is only accepted by modern
# scikit-learn) when shuffle=True.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
results = cross_val_score(xgb_model, x_train, y_train, cv=kfold)
# The CV scores were computed but never shown — report the mean.
print(results.mean())
# BUG FIX: evaluate the model defined above, not the leftover `gb` from the
# learning-rate loop (which would silently use learning_rate=0.5).
xgb_model.fit(x_train, y_train)
predictions_gb_train = xgb_model.predict(x_train)
predictions_gb_val = xgb_model.predict(x_val)
# BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
print(recall_score(y_train,predictions_gb_train,labels=['1'],average='macro'))
print(recall_score(y_val,predictions_gb_val,labels=['1'],average='macro'))
#### XGB
# XGBoost with shallow trees and a small learning rate.
xgbm = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(x_train, y_train)
pr_train = xgbm.predict(x_train)
pr_val = xgbm.predict(x_val)
# BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
print(recall_score(y_train,pr_train,labels=['1'],average='macro'))
print(recall_score(y_val,pr_val,labels=['1'],average='macro'))
This was used with all the above models, but it was not useful in increasing the recall.
# MLP with the default hyper-parameters spelled out explicitly
# (one hidden layer of 100 units, adam, up to 500 iterations).
mlp = MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
                    beta_2=0.999, early_stopping=False, epsilon=1e-08,
                    hidden_layer_sizes=(100,), learning_rate='constant',
                    learning_rate_init=0.001, max_iter=500, momentum=0.9,
                    nesterovs_momentum=True, power_t=0.5, random_state=50,
                    shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
                    verbose=False, warm_start=True)
mlp.fit(x_train, y_train)
mlp_pred_train = mlp.predict(x_train)
# BUG FIX: string class labels — labels=['1'] (Aggressive), not labels=[1].
rec_score_train = recall_score(y_train,mlp_pred_train,labels=['1'],average='macro')
mlp_pred_val = mlp.predict(x_val)
rec_score_test = recall_score(y_val, mlp_pred_val, labels=['1'],average='macro')
print(rec_score_train)
print(rec_score_test)
# mlp = MLPClassifier(random_state=12)
# mlp.fit(x_train, y_train)
# mlp_pred_train = mlp.predict(x_train)
# rec_score_train = recall_score(y_train,mlp_pred_train,labels=[1],average='macro')
# print(434)
# print(rec_score_train)
# mlp_pred_val = mlp.predict(x_val)
# rec_score_test = recall_score(y_val, mlp_pred_val, labels=[1],average='macro')
# print(rec_score_test)
def tree_to_pseudo(tree, feature_names):
    """Print a fitted sklearn decision tree as nested if/else pseudo-code.

    Each internal node becomes `if ( feature <= threshold ) { ... } else { ... }`;
    each leaf prints `return <class-count array>`.
    """
    arrays = tree.tree_
    names = [feature_names[i] for i in arrays.feature]

    def walk(node, depth=0):
        pad = " " * depth
        # sklearn stores -2 as the threshold of leaf nodes.
        if arrays.threshold[node] != -2:
            print(pad, "if ( " + names[node] + " <= " + str(arrays.threshold[node]) + " ) {")
            if arrays.children_left[node] != -1:
                walk(arrays.children_left[node], depth + 1)
            print(pad, "} else {")
            if arrays.children_right[node] != -1:
                walk(arrays.children_right[node], depth + 1)
            print(pad, "}")
        else:
            print(pad, "return " + str(arrays.value[node]))

    walk(0)
tree_to_pseudo(tree_model,train_dummified.columns)
4.
6.
7.
8.
9.
10.
import os
# Make the graphviz binaries visible on PATH (Windows/Anaconda install).
os.environ["PATH"] += os.pathsep + 'C:/Users/ADMIN/Anaconda3/Library/bin/graphviz'
train_dummified.columns
# BUG FIX: sklearn.externals.six was deprecated in scikit-learn 0.21 and
# removed in 0.23; StringIO lives in the standard library's io module.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# Render the tuned decision tree as an inline PNG via graphviz.
dot_data = StringIO()
export_graphviz(tree_model, out_file=dot_data, filled=True, rounded=True, special_characters=True,class_names=['Aggressive', 'Normal', 'Vague'],rotate=False,feature_names=train_dummified.columns)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
The neural-network model performed best, with 96% recall on the test data; the decision tree gave 70% recall on the test data; the random forest gave 80% on the train and validation data but only 53% on the test data.